# -*- coding: utf-8 -*-
import codecs
import sys,os
__all__ = ['extract_entitys']


 
'''
<entity enity_id="WKB349" title="GNU自由文档许可证">
    <name>gnu自由文件授权条款</name>
    <name>gnu_free_documentation_license</name>
    <publisher>free_software_foundation,_inc.</publisher>
    <date>当前版本 ：</date>
    <date>2008年11月3日</date>
    <debianapproved>是</debianapproved>
  </entity>

  <entity enity_id="WKB1928921" title="乌石镇_(雷州市)" />
  <entity enity_id="WKB1928922" title="温泉镇_(温县)" />

'''

def extract_entitys(keys_in, entitys_out, kb_file=None):
    """Collect the knowledge-base entities whose id is listed in ``keys_in``.

    The knowledge base is read as line-oriented XML: an ``<entity ...>``
    opening tag on its own line, one ``<prop>value</prop>`` element per
    line, and a closing ``</entity>`` line.  Self-closing ``<entity ... />``
    lines describe entities without properties.

    Every selected entity is appended to ``entitys_out`` as::

        ((enity_id, title), [(prop_name, prop_value), ...])

    ``enity_id`` and ``title`` keep their surrounding quote characters,
    exactly as they appear in the file; property values are the raw text
    between the tags (XML character references are not decoded).

    Args:
        keys_in: Container of entity ids (without quotes) to extract.
        entitys_out: List that receives the matching entities, in file order.
        kb_file: Path of the UTF-8 encoded knowledge-base file.  When
            omitted, the original hard-coded sample location is used.

    Returns:
        None.  Results are delivered through ``entitys_out``.

    Raises:
        IOError: If the knowledge-base file cannot be opened.
    """
    if kb_file is None:
        kb_path = 'E:\\desktop\\wu-request\\NLPCC 2014 Shared Tasks Guidelines\\Chinese Entity Linking  SAMPLE DATA NLPCC2014_EL_sample\\'
        kb_file = kb_path + 'PKBase_zhwiki_1_small.xml'

    # One membership test is made per entity, so turn a plain sequence into
    # a hash set once instead of scanning it linearly every time.
    if isinstance(keys_in, (list, tuple)):
        wanted = frozenset(keys_in)
    else:
        wanted = keys_in

    entity = None   # entity currently being assembled, or None
    num = 0         # number of completed entities seen so far

    # ``with`` guarantees the file is closed even if parsing raises.
    with codecs.open(kb_file, 'r', encoding='utf-8') as fid:
        # Iterating the stream yields complete lines; a size-limited
        # readline() would hand back long lines in fragments.
        for raw_line in fid:
            line = raw_line.strip()
            if not line:
                continue

            if line.startswith('<entity'):
                entity = ((_entity_attr(line, 'enity_id'),
                           _entity_attr(line, 'title')), [])
                # A self-closing tag is a complete entity on its own.
                finished = line.endswith('/>')
            elif entity is None:
                # Anything outside an entity, including a stray closing tag.
                print('skip: %s' % line)
                continue
            elif line.startswith('</entity>'):
                finished = True
            else:
                finished = False
                # Split at the first '>' only, so a '>' inside the value
                # does not truncate it.
                head, sep, rest = line.partition('>')
                is_open_tag = line.startswith('<') and not line.startswith('</')
                if sep and is_open_tag:
                    prop_name = head.replace('<', ' ').strip()
                    prop_value = rest.split('<')[0]
                    entity[1].append((prop_name, prop_value))
                else:
                    # Not a "<prop>value" line (e.g. a continuation line).
                    print('skip: %s' % line)

            if finished:
                num += 1
                if num % 1000 == 0:
                    # 378000.0 is the hard-coded divisor of the original
                    # progress report.
                    print('entity_num= %s %s' % (num, num / 378000.0))
                if entity[0][0].replace('"', '').strip() in wanted:
                    entitys_out.append(entity)
                entity = None


def _entity_attr(tag_line, attr_name):
    """Return the raw value of ``attr_name`` in ``tag_line``, quotes included.

    A quoted value is returned up to its matching closing quote, so values
    containing spaces or '=' stay intact.  An unquoted value ends at the next
    whitespace.  An empty string is returned when the attribute is missing.
    """
    marker = attr_name + '='
    start = tag_line.find(marker)
    if start < 0:
        return ''
    start += len(marker)
    quote = tag_line[start:start + 1]
    if quote in ('"', "'"):
        end = tag_line.find(quote, start + 1)
        if end >= 0:
            return tag_line[start:end + 1]
    # Unquoted or unterminated value: take the whitespace-delimited token
    # and drop any tag terminator glued to it.
    tokens = tag_line[start:].split(None, 1)
    return tokens[0].rstrip('/>') if tokens else ''
    

#keys_in = ['WKB103980','WKB100952','WKB755842','WKB173649']
#entitys_out = []
#extract_entitys(keys_in,entitys_out)
